Creating a data frame from a TSV file

# Read the tab-separated file (its first line holds the column names) into a
# data frame. stringsAsFactors = TRUE is spelled out because its default became
# FALSE in R 4.0.0; the summary() counts and the levels() call used later in
# this file need the text columns to be factors.
mygenes <- read.table("~/gene_dist_head.tsv", header = TRUE, sep = "\t",
                      stringsAsFactors = TRUE)
mygenes

summary of the dataframe mygenes

# Per-column summary of the whole data frame
summary(object = mygenes)
##                 transcript_type           feature             chr         
##  protein_coding         :2001054   exon       :1306656   1      : 238010  
##  nonsense_mediated_decay: 293471   CDS        : 791856   2      : 189916  
##  processed_transcript   : 173401   UTR        : 304070   17     : 166529  
##  retained_intron        : 150034   transcript : 215170   19     : 163304  
##  lincRNA                :  55928   stop_codon :  73411   3      : 159475  
##  antisense              :  45811   start_codon:  73358   11     : 157597  
##  (Other)                : 108613   (Other)    :  63791   (Other):1753481  
##      start                end           
##  Min.   :      577   Min.   :      647  
##  1st Qu.: 31698816   1st Qu.: 31700419  
##  Median : 56565463   Median : 56566763  
##  Mean   : 73148763   Mean   : 73152067  
##  3rd Qu.:108204790   3rd Qu.:108206944  
##  Max.   :249230780   Max.   :249231242  
## 
# Turn the summary table into an ordinary data frame: unclass() strips the
# "table" class, and the remaining arguments keep the column names untouched,
# keep the cells as character strings and use default row names.
genes_summary <- data.frame(
  unclass(summary(mygenes)),
  row.names = NULL,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
genes_summary

information about our columns and rows

# Column names of the data frame
names(mygenes)
## [1] "transcript_type" "feature"         "chr"             "start"          
## [5] "end"
# First 50 row names of the data frame
head(row.names(mygenes), n = 50)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
## [15] "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28"
## [29] "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42"
## [43] "43" "44" "45" "46" "47" "48" "49" "50"

subsetting data

# First 50 rows of columns 2 and 4 (feature and start)
print(
  head(mygenes[, c(2, 4)], n = 50)
)
##       feature start
## 1        gene 11869
## 2  transcript 11869
## 3        exon 11869
## 4        exon 12613
## 5        exon 13221
## 6  transcript 11872
## 7        exon 11872
## 8        exon 12613
## 9        exon 13225
## 10 transcript 11874
## 11       exon 11874
## 12       exon 12595
## 13       exon 13403
## 14       exon 13661
## 15 transcript 12010
## 16       exon 12010
## 17       exon 12179
## 18       exon 12613
## 19       exon 12975
## 20       exon 13221
## 21       exon 13453
## 22       gene 14363
## 23 transcript 14363
## 24       exon 29321
## 25       exon 24738
## 26       exon 18268
## 27       exon 17915
## 28       exon 17602
## 29       exon 17233
## 30       exon 16854
## 31       exon 16607
## 32       exon 15904
## 33       exon 15796
## 34       exon 14970
## 35       exon 14363
## 36 transcript 14363
## 37       exon 24734
## 38       exon 18268
## 39       exon 17915
## 40       exon 17606
## 41       exon 17498
## 42       exon 17233
## 43       exon 16854
## 44       exon 14970
## 45       exon 14363
## 46 transcript 14363
## 47       exon 29321
## 48       exon 24738
## 49       exon 17915
## 50       exon 17606
# First 50 rows of the contiguous column range 2 to 4 (feature, chr, start)
print(
  head(mygenes[, 2:4], n = 50)
)
##       feature chr start
## 1        gene   1 11869
## 2  transcript   1 11869
## 3        exon   1 11869
## 4        exon   1 12613
## 5        exon   1 13221
## 6  transcript   1 11872
## 7        exon   1 11872
## 8        exon   1 12613
## 9        exon   1 13225
## 10 transcript   1 11874
## 11       exon   1 11874
## 12       exon   1 12595
## 13       exon   1 13403
## 14       exon   1 13661
## 15 transcript   1 12010
## 16       exon   1 12010
## 17       exon   1 12179
## 18       exon   1 12613
## 19       exon   1 12975
## 20       exon   1 13221
## 21       exon   1 13453
## 22       gene   1 14363
## 23 transcript   1 14363
## 24       exon   1 29321
## 25       exon   1 24738
## 26       exon   1 18268
## 27       exon   1 17915
## 28       exon   1 17602
## 29       exon   1 17233
## 30       exon   1 16854
## 31       exon   1 16607
## 32       exon   1 15904
## 33       exon   1 15796
## 34       exon   1 14970
## 35       exon   1 14363
## 36 transcript   1 14363
## 37       exon   1 24734
## 38       exon   1 18268
## 39       exon   1 17915
## 40       exon   1 17606
## 41       exon   1 17498
## 42       exon   1 17233
## 43       exon   1 16854
## 44       exon   1 14970
## 45       exon   1 14363
## 46 transcript   1 14363
## 47       exon   1 29321
## 48       exon   1 24738
## 49       exon   1 17915
## 50       exon   1 17606
# First 50 rows located on chromosome 1, showing columns 1 to 5.
# chr holds text labels, so the comparison is written against the string "1".
print(
  head(mygenes[mygenes$chr == "1", 1:5], n = 50)
)
##                       transcript_type    feature chr start   end
## 1                          pseudogene       gene   1 11869 14412
## 2                processed_transcript transcript   1 11869 14409
## 3                processed_transcript       exon   1 11869 12227
## 4                processed_transcript       exon   1 12613 12721
## 5                processed_transcript       exon   1 13221 14409
## 6  transcribed_unprocessed_pseudogene transcript   1 11872 14412
## 7  transcribed_unprocessed_pseudogene       exon   1 11872 12227
## 8  transcribed_unprocessed_pseudogene       exon   1 12613 12721
## 9  transcribed_unprocessed_pseudogene       exon   1 13225 14412
## 10 transcribed_unprocessed_pseudogene transcript   1 11874 14409
## 11 transcribed_unprocessed_pseudogene       exon   1 11874 12227
## 12 transcribed_unprocessed_pseudogene       exon   1 12595 12721
## 13 transcribed_unprocessed_pseudogene       exon   1 13403 13655
## 14 transcribed_unprocessed_pseudogene       exon   1 13661 14409
## 15 transcribed_unprocessed_pseudogene transcript   1 12010 13670
## 16 transcribed_unprocessed_pseudogene       exon   1 12010 12057
## 17 transcribed_unprocessed_pseudogene       exon   1 12179 12227
## 18 transcribed_unprocessed_pseudogene       exon   1 12613 12697
## 19 transcribed_unprocessed_pseudogene       exon   1 12975 13052
## 20 transcribed_unprocessed_pseudogene       exon   1 13221 13374
## 21 transcribed_unprocessed_pseudogene       exon   1 13453 13670
## 22                         pseudogene       gene   1 14363 29806
## 23             unprocessed_pseudogene transcript   1 14363 29370
## 24             unprocessed_pseudogene       exon   1 29321 29370
## 25             unprocessed_pseudogene       exon   1 24738 24891
## 26             unprocessed_pseudogene       exon   1 18268 18379
## 27             unprocessed_pseudogene       exon   1 17915 18061
## 28             unprocessed_pseudogene       exon   1 17602 17742
## 29             unprocessed_pseudogene       exon   1 17233 17364
## 30             unprocessed_pseudogene       exon   1 16854 17055
## 31             unprocessed_pseudogene       exon   1 16607 16765
## 32             unprocessed_pseudogene       exon   1 15904 15947
## 33             unprocessed_pseudogene       exon   1 15796 15901
## 34             unprocessed_pseudogene       exon   1 14970 15038
## 35             unprocessed_pseudogene       exon   1 14363 14829
## 36             unprocessed_pseudogene transcript   1 14363 24886
## 37             unprocessed_pseudogene       exon   1 24734 24886
## 38             unprocessed_pseudogene       exon   1 18268 18369
## 39             unprocessed_pseudogene       exon   1 17915 18061
## 40             unprocessed_pseudogene       exon   1 17606 17742
## 41             unprocessed_pseudogene       exon   1 17498 17504
## 42             unprocessed_pseudogene       exon   1 17233 17364
## 43             unprocessed_pseudogene       exon   1 16854 17055
## 44             unprocessed_pseudogene       exon   1 14970 15038
## 45             unprocessed_pseudogene       exon   1 14363 14829
## 46             unprocessed_pseudogene transcript   1 14363 29370
## 47             unprocessed_pseudogene       exon   1 29321 29370
## 48             unprocessed_pseudogene       exon   1 24738 24891
## 49             unprocessed_pseudogene       exon   1 17915 18061
## 50             unprocessed_pseudogene       exon   1 17606 17742

Categories (and their number) in categorical data

# List every distinct category of the chr column; the index printed on the last
# output line gives the number of categories.
# as.factor() leaves an existing factor unchanged and converts a character
# column (what read.table() produces by default since R 4.0.0) to a factor, so
# levels() does not return NULL in that case.
print(levels(as.factor(mygenes$chr)))
##   [1] "1"                        "10"                      
##   [3] "11"                       "12"                      
##   [5] "13"                       "14"                      
##   [7] "15"                       "16"                      
##   [9] "17"                       "18"                      
##  [11] "19"                       "2"                       
##  [13] "20"                       "21"                      
##  [15] "22"                       "3"                       
##  [17] "4"                        "5"                       
##  [19] "6"                        "7"                       
##  [21] "8"                        "9"                       
##  [23] "GL000191.1"               "GL000192.1"              
##  [25] "GL000193.1"               "GL000194.1"              
##  [27] "GL000195.1"               "GL000196.1"              
##  [29] "GL000199.1"               "GL000201.1"              
##  [31] "GL000204.1"               "GL000205.1"              
##  [33] "GL000209.1"               "GL000211.1"              
##  [35] "GL000212.1"               "GL000213.1"              
##  [37] "GL000215.1"               "GL000216.1"              
##  [39] "GL000218.1"               "GL000219.1"              
##  [41] "GL000220.1"               "GL000221.1"              
##  [43] "GL000222.1"               "GL000223.1"              
##  [45] "GL000224.1"               "GL000225.1"              
##  [47] "GL000228.1"               "GL000229.1"              
##  [49] "GL000230.1"               "GL000231.1"              
##  [51] "GL000233.1"               "GL000236.1"              
##  [53] "GL000237.1"               "GL000240.1"              
##  [55] "GL000241.1"               "GL000242.1"              
##  [57] "GL000243.1"               "GL000247.1"              
##  [59] "HG1007_PATCH"             "HG1032_PATCH"            
##  [61] "HG104_HG975_PATCH"        "HG1063_PATCH"            
##  [63] "HG1074_PATCH"             "HG1079_PATCH"            
##  [65] "HG1082_HG167_PATCH"       "HG1091_PATCH"            
##  [67] "HG1133_PATCH"             "HG1146_PATCH"            
##  [69] "HG115_PATCH"              "HG1208_PATCH"            
##  [71] "HG1211_PATCH"             "HG122_PATCH"             
##  [73] "HG1257_PATCH"             "HG1287_PATCH"            
##  [75] "HG1292_PATCH"             "HG1293_PATCH"            
##  [77] "HG1304_PATCH"             "HG1308_PATCH"            
##  [79] "HG1322_PATCH"             "HG1350_HG959_PATCH"      
##  [81] "HG14_PATCH"               "HG142_HG150_NOVEL_TEST"  
##  [83] "HG1423_PATCH"             "HG1424_PATCH"            
##  [85] "HG1425_PATCH"             "HG1426_PATCH"            
##  [87] "HG1433_PATCH"             "HG1434_PATCH"            
##  [89] "HG1435_PATCH"             "HG1436_HG1432_PATCH"     
##  [91] "HG1437_PATCH"             "HG1438_PATCH"            
##  [93] "HG1439_PATCH"             "HG144_PATCH"             
##  [95] "HG1440_PATCH"             "HG1441_PATCH"            
##  [97] "HG1442_PATCH"             "HG1443_HG1444_PATCH"     
##  [99] "HG1453_PATCH"             "HG1458_PATCH"            
## [101] "HG1459_PATCH"             "HG1462_PATCH"            
## [103] "HG1463_PATCH"             "HG1472_PATCH"            
## [105] "HG1479_PATCH"             "HG1486_PATCH"            
## [107] "HG1487_PATCH"             "HG1488_PATCH"            
## [109] "HG1490_PATCH"             "HG1497_PATCH"            
## [111] "HG1500_PATCH"             "HG1501_PATCH"            
## [113] "HG1502_PATCH"             "HG151_NOVEL_TEST"        
## [115] "HG1591_PATCH"             "HG1592_PATCH"            
## [117] "HG1595_PATCH"             "HG1699_PATCH"            
## [119] "HG174_HG254_PATCH"        "HG183_PATCH"             
## [121] "HG185_PATCH"              "HG186_PATCH"             
## [123] "HG19_PATCH"               "HG193_PATCH"             
## [125] "HG237_PATCH"              "HG243_PATCH"             
## [127] "HG256_PATCH"              "HG27_PATCH"              
## [129] "HG271_PATCH"              "HG280_PATCH"             
## [131] "HG281_PATCH"              "HG29_PATCH"              
## [133] "HG299_PATCH"              "HG305_PATCH"             
## [135] "HG306_PATCH"              "HG311_PATCH"             
## [137] "HG325_PATCH"              "HG329_PATCH"             
## [139] "HG339_PATCH"              "HG344_PATCH"             
## [141] "HG348_PATCH"              "HG357_PATCH"             
## [143] "HG375_PATCH"              "HG385_PATCH"             
## [145] "HG388_HG400_PATCH"        "HG414_PATCH"             
## [147] "HG417_PATCH"              "HG418_PATCH"             
## [149] "HG444_PATCH"              "HG480_HG481_PATCH"       
## [151] "HG497_PATCH"              "HG50_PATCH"              
## [153] "HG506_HG507_HG1000_PATCH" "HG531_PATCH"             
## [155] "HG536_PATCH"              "HG544_PATCH"             
## [157] "HG686_PATCH"              "HG7_PATCH"               
## [159] "HG706_PATCH"              "HG729_PATCH"             
## [161] "HG730_PATCH"              "HG736_PATCH"             
## [163] "HG745_PATCH"              "HG747_PATCH"             
## [165] "HG748_PATCH"              "HG75_PATCH"              
## [167] "HG79_PATCH"               "HG858_PATCH"             
## [169] "HG865_PATCH"              "HG871_PATCH"             
## [171] "HG873_PATCH"              "HG883_PATCH"             
## [173] "HG905_PATCH"              "HG944_PATCH"             
## [175] "HG946_PATCH"              "HG953_PATCH"             
## [177] "HG957_PATCH"              "HG962_PATCH"             
## [179] "HG971_PATCH"              "HG979_PATCH"             
## [181] "HG987_PATCH"              "HG989_PATCH"             
## [183] "HG990_PATCH"              "HG991_PATCH"             
## [185] "HG996_PATCH"              "HG998_1_PATCH"           
## [187] "HG998_2_PATCH"            "HG999_1_PATCH"           
## [189] "HG999_2_PATCH"            "HSCHR1_1_CTG31"          
## [191] "HSCHR1_2_CTG31"           "HSCHR1_3_CTG31"          
## [193] "HSCHR10_1_CTG2"           "HSCHR10_1_CTG5"          
## [195] "HSCHR12_1_CTG1"           "HSCHR12_1_CTG2_1"        
## [197] "HSCHR12_1_CTG5"           "HSCHR12_2_CTG2"          
## [199] "HSCHR12_2_CTG2_1"         "HSCHR12_3_CTG2_1"        
## [201] "HSCHR15_1_CTG4"           "HSCHR15_1_CTG8"          
## [203] "HSCHR16_1_CTG3_1"         "HSCHR16_2_CTG3_1"        
## [205] "HSCHR17_1"                "HSCHR17_1_CTG1"          
## [207] "HSCHR17_1_CTG4"           "HSCHR17_2_CTG4"          
## [209] "HSCHR17_3_CTG4"           "HSCHR17_4_CTG4"          
## [211] "HSCHR17_5_CTG4"           "HSCHR17_6_CTG4"          
## [213] "HSCHR18_1_CTG1_1"         "HSCHR18_1_CTG2_1"        
## [215] "HSCHR18_2_CTG2"           "HSCHR18_2_CTG2_1"        
## [217] "HSCHR19_1_CTG3"           "HSCHR19_1_CTG3_1"        
## [219] "HSCHR19_2_CTG3"           "HSCHR19_3_CTG3"          
## [221] "HSCHR19LRC_COX1_CTG1"     "HSCHR19LRC_COX2_CTG1"    
## [223] "HSCHR19LRC_LRC_I_CTG1"    "HSCHR19LRC_LRC_J_CTG1"   
## [225] "HSCHR19LRC_LRC_S_CTG1"    "HSCHR19LRC_LRC_T_CTG1"   
## [227] "HSCHR19LRC_PGF1_CTG1"     "HSCHR19LRC_PGF2_CTG1"    
## [229] "HSCHR2_1_CTG1"            "HSCHR2_1_CTG12"          
## [231] "HSCHR2_2_CTG12"           "HSCHR20_1_CTG1"          
## [233] "HSCHR21_2_CTG1_1"         "HSCHR21_3_CTG1_1"        
## [235] "HSCHR21_4_CTG1_1"         "HSCHR22_1_CTG1"          
## [237] "HSCHR22_1_CTG2"           "HSCHR22_2_CTG1"          
## [239] "HSCHR3_1_CTG1"            "HSCHR3_1_CTG2_1"         
## [241] "HSCHR4_1"                 "HSCHR4_1_CTG12"          
## [243] "HSCHR4_1_CTG6"            "HSCHR4_2_CTG9"           
## [245] "HSCHR5_1_CTG1"            "HSCHR5_1_CTG2"           
## [247] "HSCHR5_1_CTG5"            "HSCHR5_2_CTG1"           
## [249] "HSCHR5_3_CTG1"            "HSCHR6_1_CTG5"           
## [251] "HSCHR6_MHC_APD"           "HSCHR6_MHC_COX"          
## [253] "HSCHR6_MHC_DBB"           "HSCHR6_MHC_MANN"         
## [255] "HSCHR6_MHC_MCF"           "HSCHR6_MHC_QBL"          
## [257] "HSCHR6_MHC_SSTO"          "HSCHR7_1_CTG6"           
## [259] "HSCHR9_1_CTG1"            "HSCHR9_1_CTG35"          
## [261] "HSCHR9_2_CTG35"           "HSCHR9_3_CTG35"          
## [263] "MT"                       "X"                       
## [265] "Y"

creating a graph with a subset of data and then making interactive plots

# install.packages("ggplot2")  # The install calls are commented out; remove the leading '#'
# install.packages("plotly")   # to install these packages if they are not already installed.
library(ggplot2) # We load in libraries
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Labels of the 22 autosomes as a character vector, in numerical order
autosomes <- as.character(1:22)
# Keep only the rows of 'mygenes' whose chr is one of the autosomes. If chr is a
# factor, the result still carries the levels of the discarded sequences.
genes <- mygenes[mygenes$chr %in% autosomes, ]
genes
# Rebuild chr as a factor whose only levels are the autosomes, ordered
# numerically (1, 2, ..., 22) rather than alphabetically (1, 10, 11, ...)
genes$chr <- factor(genes$chr, levels = autosomes)
# Bar chart with one bar per chromosome, filled by feature
p <- ggplot(data = genes) +
  geom_bar(mapping = aes(x = chr, fill = feature), width = 1)
# Interactive version of the bar chart
ggplotly(p)
# The same bar chart drawn in polar coordinates
p + coord_polar()